{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "27365a01-6f35-4970-8f45-6fc6635d8b5d",
   "metadata": {},
   "source": [
    "# Lesson 5 - Facial Similarity Search"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d4aea4f8",
   "metadata": {},
   "source": [
    "### Import the Needed Packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67b8d86b-f3cd-4a8e-a03d-20f68e8e8d8a",
   "metadata": {
    "height": 47
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "# Suppress every warning category for the rest of the kernel session.\n",
    "# NOTE(review): this also hides deprecation notices from the libraries used later.\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4928bbe6-7f9f-4ea9-b87a-0c79e041a3d8",
   "metadata": {
    "height": 267
   },
   "outputs": [],
   "source": [
    "import ast\n",
    "import contextlib\n",
    "import glob\n",
    "import os\n",
    "import time\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from deepface import DeepFace\n",
    "from pinecone import Pinecone, ServerlessSpec\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.manifold import TSNE\n",
    "from tqdm import tqdm\n",
    "\n",
    "from DLAIUtils import Utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a8234494-5360-4738-b349-6ed83953a01e",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "# Fetch the Pinecone API key through the course helper instead of hardcoding it here.\n",
    "utils = Utils()\n",
    "PINECONE_API_KEY = utils.get_pinecone_api_key()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aca8a39b",
   "metadata": {},
   "source": [
    "### Load the Dataset\n",
    "\n",
    "**Note:** To access the dataset outside of this course, copy the following two lines into a code cell, uncomment them, and run them:\n",
    "\n",
    "#!wget -q --show-progress -O family_photos.zip \"https://www.dropbox.com/scl/fi/yg0f2ynbzzd2q4nsweti5/family_photos.zip?rlkey=00oeuiii3jgapz2b1bfj0vzys&dl=0\"\n",
    "\n",
    "#!unzip -q family_photos.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f68f1234-c359-4768-8e0f-1ade09d82e4e",
   "metadata": {
    "height": 114
   },
   "outputs": [],
   "source": [
    "def show_img(f):\n",
    "  \"\"\"Render the image file at path ``f`` in a new 4x3 inch figure.\"\"\"\n",
    "  pixels = plt.imread(f)\n",
    "  _, axes = plt.subplots(figsize=(4, 3))\n",
    "  axes.imshow(pixels)\n",
    "\n",
    "show_img('family/dad/P06260_face5.jpg')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b60e00de-9837-4564-8158-914d76d7ab7d",
   "metadata": {
    "height": 46
   },
   "outputs": [],
   "source": [
    "# Preview a sample image from the mom folder.\n",
    "show_img('family/mom/P04407_face2.jpg')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27de473b-fe23-4c9b-a79d-39a4637a9624",
   "metadata": {
    "height": 46
   },
   "outputs": [],
   "source": [
    "# Preview a sample image from the child folder.\n",
    "show_img('family/child/P04414_face1.jpg')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "34f8f2c7",
   "metadata": {},
   "source": [
    "### Set Up Pinecone"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e1022c1-7d74-4c84-b67f-6af9e76dc189",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "# DeepFace model name passed to every DeepFace.represent call in this notebook.\n",
    "MODEL = \"Facenet\"\n",
    "# Index name produced by the course helper for the 'dl-ai' argument.\n",
    "INDEX_NAME = utils.create_dlai_index_name('dl-ai')\n",
    "\n",
    "# Pinecone client authenticated with PINECONE_API_KEY.\n",
    "pinecone = Pinecone(api_key=PINECONE_API_KEY)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0125e51e",
   "metadata": {},
   "source": [
    "### Create Embeddings Using DeepFace"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ee222b3-55f2-429f-a652-ecaece3ecdff",
   "metadata": {
    "height": 301
   },
   "outputs": [],
   "source": [
    "def generate_vectors():\n",
    "  \"\"\"Embed every family photo with DeepFace and write the results to ./vectors.vec.\n",
    "\n",
    "  One line is written per image in the form ``person:filename:embedding``, where\n",
    "  ``embedding`` is the string form of the value DeepFace returns. Files are\n",
    "  processed in sorted order so repeated runs produce the same vector file.\n",
    "  Images for which DeepFace raises ValueError, UnboundLocalError or\n",
    "  AttributeError are reported and skipped.\n",
    "  \"\"\"\n",
    "  VECTOR_FILE = \"./vectors.vec\"\n",
    "\n",
    "  # Remove any previous output first so stale vectors are never mixed in.\n",
    "  with contextlib.suppress(FileNotFoundError):\n",
    "    os.remove(VECTOR_FILE)\n",
    "  with open(VECTOR_FILE, \"w\") as f:\n",
    "    for person in [\"mom\", \"dad\", \"child\"]:\n",
    "      # glob returns paths in arbitrary order; sort them for a reproducible file.\n",
    "      files = sorted(glob.glob(f'family/{person}/*'))\n",
    "      for file in tqdm(files):\n",
    "        try:\n",
    "          embedding = DeepFace.represent(img_path=file, model_name=MODEL, enforce_detection=False)[0]['embedding']\n",
    "          f.write(f'{person}:{os.path.basename(file)}:{embedding}\\n')\n",
    "        except (ValueError, UnboundLocalError, AttributeError) as e:\n",
    "          # Name the offending image so a skipped file can be traced.\n",
    "          print(f'{file}: {e}')\n",
    "\n",
    "generate_vectors()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23ffd65b-3d67-456d-b847-ffddb43f16c6",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "# Peek at the first 10 lines of the generated vector file.\n",
    "!head -10 vectors.vec"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9fc984a",
   "metadata": {},
   "source": [
    "### Plot the Data of Images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4234c0b8-5a2b-4092-a741-eb8af41e325e",
   "metadata": {
    "height": 267
   },
   "outputs": [],
   "source": [
    "def gen_tsne_df(person, perplexity):\n",
    "    \"\"\"Project one person's stored embeddings to 2-D with PCA followed by t-SNE.\n",
    "\n",
    "    Args:\n",
    "        person: Label to select from ./vectors.vec (first field of each line).\n",
    "        perplexity: Perplexity value passed to TSNE.\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame with columns 'x' and 'y', one row per selected embedding.\n",
    "    \"\"\"\n",
    "    vectors = []\n",
    "    with open('./vectors.vec', 'r') as f:\n",
    "        for line in tqdm(f):\n",
    "            p, _, v = line.split(':')\n",
    "            if person == p:\n",
    "                # literal_eval parses the list text without executing arbitrary code.\n",
    "                vectors.append(ast.literal_eval(v.strip()))\n",
    "    # Reduce to 8 principal components first, then embed those in 2-D.\n",
    "    pca = PCA(n_components=8)\n",
    "    # NOTE(review): newer scikit-learn releases rename n_iter to max_iter - confirm\n",
    "    # against the installed version before upgrading.\n",
    "    tsne = TSNE(2, perplexity=perplexity, random_state=0, n_iter=1000,\n",
    "                verbose=0, metric='euclidean', learning_rate=75)\n",
    "    print(f'transform {len(vectors)} vectors')\n",
    "    pca_transform = pca.fit_transform(vectors)\n",
    "    embeddings2d = tsne.fit_transform(pca_transform)\n",
    "    return pd.DataFrame({'x': embeddings2d[:, 0], 'y': embeddings2d[:, 1]})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39436e5f-56e6-423b-8f4b-b800ec3892bc",
   "metadata": {
    "height": 335
   },
   "outputs": [],
   "source": [
    "def plot_tsne(perplexity, model):\n",
    "    \"\"\"Scatter-plot the 2-D t-SNE projection of each person's embeddings on one figure.\n",
    "\n",
    "    ``perplexity`` is forwarded to gen_tsne_df and shown in the subtitle;\n",
    "    ``model`` only appears in the plot title.\n",
    "    \"\"\"\n",
    "    person_colors = {'dad':'#ee8933', 'child':'#4fad5b', 'mom':'#4c93db'}\n",
    "    spine_colors = {'top': None, 'right': None, 'left': '#2B2F30', 'bottom': '#2B2F30'}\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(8,5))\n",
    "    ax.grid(color='#EAEAEB', linewidth=0.5)\n",
    "    for side, shade in spine_colors.items():\n",
    "        ax.spines[side].set_color(shade)\n",
    "\n",
    "    # One scatter series per person, in the dictionary's order.\n",
    "    for person, shade in person_colors.items():\n",
    "        points = gen_tsne_df(person, perplexity)\n",
    "        ax.scatter(points.x, points.y, alpha=.5, label=person, color=shade)\n",
    "\n",
    "    ax.set_title(f'Scatter plot of faces using {model}', fontsize=16, fontweight='bold', pad=20)\n",
    "    fig.suptitle(f't-SNE [perplexity={perplexity}]', y=0.92, fontsize=13)\n",
    "    ax.legend(loc='best', frameon=True)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "975f8f05-4c96-4b45-8ced-161ae979d704",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "# Plot all three people with t-SNE perplexity 44; 'facenet' is only used in the title.\n",
    "plot_tsne(44, 'facenet')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c929b408",
   "metadata": {},
   "source": [
    "### Store the Embeddings in Pinecone"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1188b12-93df-4c66-a19c-dd077883f391",
   "metadata": {
    "height": 131
   },
   "outputs": [],
   "source": [
    "# Start from an empty index: drop any existing index with this name, then recreate it.\n",
    "if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:\n",
    "  pinecone.delete_index(INDEX_NAME)\n",
    "# NOTE(review): dimension=128 has to equal the embedding length produced by MODEL -\n",
    "# presumably Facenet's output size; verify if MODEL changes.\n",
    "pinecone.create_index(name=INDEX_NAME, dimension=128, metric='cosine',\n",
    "  spec=ServerlessSpec(cloud='aws', region='us-west-2'))\n",
    "\n",
    "# Handle used by the following cells for upserts and queries.\n",
    "index = pinecone.Index(INDEX_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aeef29aa-8c3a-458e-b0fd-1caf8f54ba00",
   "metadata": {
    "height": 131
   },
   "outputs": [],
   "source": [
    "def store_vectors(batch_size=100):\n",
    "  \"\"\"Upsert every embedding from vectors.vec into the Pinecone index.\n",
    "\n",
    "  Each record gets the id ``person-file`` plus metadata holding the person and\n",
    "  the file field of its line. Records are sent in batches of ``batch_size``\n",
    "  rather than one request per vector.\n",
    "  \"\"\"\n",
    "  batch = []\n",
    "  with open(\"vectors.vec\", \"r\") as f:\n",
    "    for line in tqdm(f):\n",
    "      person, file, vec = line.split(':')\n",
    "      # literal_eval parses the list text without executing arbitrary code.\n",
    "      batch.append((f'{person}-{file}', ast.literal_eval(vec.strip()), {\"person\":person, \"file\":file}))\n",
    "      if len(batch) >= batch_size:\n",
    "        index.upsert(batch)\n",
    "        batch = []\n",
    "  # Flush the final, partially filled batch.\n",
    "  if batch:\n",
    "    index.upsert(batch)\n",
    "store_vectors()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "511d9016-7f72-4707-9f6d-c0035b216837",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "# Display the index statistics after the upsert.\n",
    "index.describe_index_stats()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c04d6854",
   "metadata": {},
   "source": [
    "### Calculate the Similarity Scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8aa0bc2-e251-4142-b738-6d374cfe3445",
   "metadata": {
    "height": 301
   },
   "outputs": [],
   "source": [
    "def test(vec_groups, parent, child):\n",
    "  \"\"\"Print the mean similarity score between a parent's vectors and the child's.\n",
    "\n",
    "  Args:\n",
    "    vec_groups: Mapping from person label to a list of embedding vectors.\n",
    "    parent: Key in vec_groups whose vectors are used as queries.\n",
    "    child: Value the \"person\" metadata filter must equal for returned matches.\n",
    "\n",
    "  Up to SAMPLE_SIZE parent vectors are each queried for their top K matches,\n",
    "  and the mean is taken over the scores actually returned.\n",
    "  \"\"\"\n",
    "  index = pinecone.Index(INDEX_NAME)\n",
    "  K = 10\n",
    "  SAMPLE_SIZE = 10\n",
    "  # Slicing tolerates a parent with fewer than SAMPLE_SIZE vectors.\n",
    "  sample_vecs = vec_groups[parent][:SAMPLE_SIZE]\n",
    "  total = 0\n",
    "  match_count = 0\n",
    "  for vec in tqdm(sample_vecs):\n",
    "    query_response = index.query(\n",
    "      top_k=K,\n",
    "      vector=vec,\n",
    "      filter={\n",
    "        \"person\": {\"$eq\": child}\n",
    "      }\n",
    "    )\n",
    "    for row in query_response[\"matches\"]:\n",
    "      total += row[\"score\"]\n",
    "      match_count += 1\n",
    "  # Divide by the number of scores received rather than SAMPLE_SIZE*K, so a short\n",
    "  # result list does not drag the average down.\n",
    "  average = total / match_count if match_count else float('nan')\n",
    "  print(f'\\n\\n{parent} AVG: {average}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9048d9d-e323-451d-b5c9-a411a8c0a202",
   "metadata": {
    "height": 233
   },
   "outputs": [],
   "source": [
    "def compute_scores():\n",
    "  \"\"\"Load the embeddings from vectors.vec grouped by person, then print the\n",
    "  dad-vs-child and mom-vs-child average similarity via test().\"\"\"\n",
    "  vec_groups = {\"dad\":[], \"mom\":[], \"child\":[]}\n",
    "  with open(\"vectors.vec\", \"r\") as f:\n",
    "    for line in tqdm(f):\n",
    "      person, file, vec = line.split(':')\n",
    "      # literal_eval parses the list text without executing arbitrary code.\n",
    "      vec_groups[person].append(ast.literal_eval(vec.strip()))\n",
    "  print(f\"DAD {'-' * 20}\")\n",
    "  test(vec_groups, \"dad\", \"child\")\n",
    "  print(f\"MOM {'-' * 20}\")\n",
    "  test(vec_groups, \"mom\", \"child\")\n",
    "\n",
    "compute_scores()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "34682e8b",
   "metadata": {},
   "source": [
    "### Check the Matching Images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55855dea-4cfb-4052-a341-1cc982ed8c14",
   "metadata": {
    "height": 46
   },
   "outputs": [],
   "source": [
    "# Child photo used as the query image in the cells that follow.\n",
    "child_base = 'family/child/P06310_face1.jpg'\n",
    "show_img(child_base)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a9eecf2-e9c1-49aa-a815-5ebf9974eb26",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "# Now find the closest dad photo, given that we know dad is \"most similar\".\n",
    "# NOTE(review): unlike generate_vectors(), this call leaves enforce_detection at its\n",
    "# default - confirm that is intended.\n",
    "embedding = DeepFace.represent(img_path=child_base, model_name=MODEL)[0]['embedding']\n",
    "print(embedding)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6c58cf9-b04c-42af-ae45-11d9cf94bbb1",
   "metadata": {
    "height": 148
   },
   "outputs": [],
   "source": [
    "# Query the top 3 matches for the child embedding, restricted to records whose\n",
    "# \"person\" metadata equals \"dad\"; metadata is returned so the file name is available.\n",
    "query_response = index.query(\n",
    "      top_k=3,\n",
    "      vector = embedding,\n",
    "      filter={\n",
    "        \"person\": {\"$eq\": \"dad\"}\n",
    "      },\n",
    "      include_metadata=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "159d27c3-b7f3-4d19-8e38-f02aa5fbb38a",
   "metadata": {
    "height": 46
   },
   "outputs": [],
   "source": [
    "# Inspect the raw query response.\n",
    "print(query_response)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4299b565-4937-48d3-af9a-ab7fdb4e33f6",
   "metadata": {
    "height": 46
   },
   "outputs": [],
   "source": [
    "# File name of the first returned match, read from its \"file\" metadata.\n",
    "# Raises IndexError if the query returned no matches.\n",
    "photo = query_response['matches'][0]['metadata']['file']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ba8ca21-6233-466f-8ea3-e4079bda5809",
   "metadata": {
    "height": 46
   },
   "outputs": [],
   "source": [
    "# Display the matched dad photo.\n",
    "show_img(f'family/dad/{photo}')\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
